xgboost airline example

Airlines all years 1987-2008: https://s3.amazonaws.com/h2o-airlines-unpacked/allyears.csv (12 GB) Or if you want something bigger here is the 10x version: https://s3.amazonaws.com/h2o-airlines-unpacked/allyears_10.csv (120 GB) Or if you want to play with something smaller collection of 2000 rows from all years airline data set. For this example we will be using this this data set: https://s3.amazonaws.com/h2o-airlines-unpacked/allyears2k.csv (4.5 MB)

In [13]:
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [14]:
df = pd.read_csv('https://s3.amazonaws.com/h2o-airlines-unpacked/allyears2k.csv',encoding='iso8859_15')
df = df.dropna(axis=1)
X = df.drop('IsDepDelayed',axis=1)
y = df['IsDepDelayed']


/opt/conda/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2698: DtypeWarning: Columns (22) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

In [15]:
df.dtypes


Out[15]:
Year              int64
Month             int64
DayofMonth        int64
DayOfWeek         int64
CRSDepTime        int64
CRSArrTime        int64
UniqueCarrier    object
FlightNum         int64
Origin           object
Dest             object
Cancelled         int64
Diverted          int64
IsArrDelayed     object
IsDepDelayed     object
dtype: object

In [16]:
from sklearn.base import BaseEstimator, TransformerMixin

class EncodeCategorical(BaseEstimator, TransformerMixin):
    """
    Encodes a specified list of columns or all columns if None.
    """

    def __init__(self, columns=None):
        self.columns  = columns
        self.encoders = None

    def fit(self, data, target=None):
        """
        Expects a data frame with named columns to encode.
        """
        # Encode all columns if columns is None
        if self.columns is None:
            self.columns = data.columns

        # Fit a label encoder for each column in the data frame
        self.encoders = {
            column: LabelEncoder().fit(data[column])
            for column in self.columns
        }
        return self

    def transform(self, data):
        """
        Uses the encoders to transform a data frame.
        """
        output = data.copy()
        for column, encoder in self.encoders.items():
            output[column] = encoder.transform(data[column])

        return output
    
    def inverse_transform(self, data):
        """
        Uses the encoders to transform a data frame.
        """
        output = data.copy()
        for column, encoder in self.encoders.items():
            output[column] = encoder.inverse_transform(data[column])

        return output
    

encoder = EncodeCategorical(['UniqueCarrier','Origin','Dest','IsArrDelayed'])
y_encoder = EncodeCategorical(['IsDepDelayed'])

In [17]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.pipeline import Pipeline


cols = ['Year', 'Month', 'DayofMonth','DayOfWeek', 'CRSDepTime', 'UniqueCarrier', 'Origin', 'Dest','IsDepDelayed']
df = df[cols]
xgb = XGBClassifier()
pipeline = Pipeline([
    ("label_encoder", encoder),
    ('Classifier',xgb)
])

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [19]:
pipeline.fit(X_train,y_train)


Out[19]:
Pipeline(steps=[('label_encoder', EncodeCategorical(columns=['UniqueCarrier', 'Origin', 'Dest', 'IsArrDelayed'])), ('Classifier', XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1))])

In [20]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
predictions = pipeline.predict(X_test)

le = LabelEncoder()
le.fit(y_test)
print('accurancy: %f '% accuracy_score(le.transform(y_test), le.transform(predictions)))
print('auc: %f' % roc_auc_score(le.transform(y_test), le.transform(predictions)))


accurancy: 0.744202 
auc: 0.742402

In [21]:
print(xgb.feature_importances_)


[ 0.19642857  0.          0.08116883  0.04220779  0.08928572  0.09740259
  0.10714286  0.0762987   0.13311689  0.04707792  0.06493507  0.
  0.06493507]

In [22]:
from xgboost import plot_importance
%matplotlib inline
plot_importance(xgb)


Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fb13f5c4dd8>

In [23]:
X


Out[23]:
Year Month DayofMonth DayOfWeek CRSDepTime CRSArrTime UniqueCarrier FlightNum Origin Dest Cancelled Diverted IsArrDelayed
0 1987 10 14 3 730 849 PS 1451 SAN SFO 0 0 YES
1 1987 10 15 4 730 849 PS 1451 SAN SFO 0 0 YES
2 1987 10 17 6 730 849 PS 1451 SAN SFO 0 0 YES
3 1987 10 18 7 730 849 PS 1451 SAN SFO 0 0 NO
4 1987 10 19 1 730 849 PS 1451 SAN SFO 0 0 YES
5 1987 10 21 3 730 849 PS 1451 SAN SFO 0 0 NO
6 1987 10 22 4 730 849 PS 1451 SAN SFO 0 0 YES
7 1987 10 23 5 730 849 PS 1451 SAN SFO 0 0 YES
8 1987 10 24 6 730 849 PS 1451 SAN SFO 0 0 YES
9 1987 10 25 7 730 849 PS 1451 SAN SFO 0 0 YES
10 1987 10 26 1 730 849 PS 1451 SAN SFO 0 0 YES
11 1987 10 28 3 725 855 PS 1451 SAN SFO 0 0 YES
12 1987 10 29 4 725 855 PS 1451 SAN SFO 0 0 YES
13 1987 10 31 6 725 855 PS 1451 SAN SFO 0 0 NO
14 1987 10 1 4 915 1001 PS 1451 SFO RNO 0 0 YES
15 1987 10 2 5 915 1001 PS 1451 SFO RNO 0 0 YES
16 1987 10 3 6 915 1001 PS 1451 SFO RNO 0 0 YES
17 1987 10 4 7 915 1001 PS 1451 SFO RNO 0 0 YES
18 1987 10 5 1 915 1001 PS 1451 SFO RNO 0 0 YES
19 1987 10 6 2 915 1001 PS 1451 SFO RNO 0 0 YES
20 1987 10 7 3 915 1001 PS 1451 SFO RNO 0 0 YES
21 1987 10 8 4 915 1001 PS 1451 SFO RNO 0 0 YES
22 1987 10 9 5 915 1001 PS 1451 SFO RNO 0 0 YES
23 1987 10 10 6 915 1001 PS 1451 SFO RNO 0 0 YES
24 1987 10 11 7 915 1001 PS 1451 SFO RNO 0 0 YES
25 1987 10 12 1 915 1001 PS 1451 SFO RNO 0 0 YES
26 1987 10 13 2 915 1001 PS 1451 SFO RNO 0 0 YES
27 1987 10 14 3 915 1001 PS 1451 SFO RNO 0 0 YES
28 1987 10 15 4 915 1001 PS 1451 SFO RNO 0 0 YES
29 1987 10 17 6 915 1001 PS 1451 SFO RNO 0 0 YES
... ... ... ... ... ... ... ... ... ... ... ... ... ...
43948 2008 1 3 4 930 1025 WN 324 SJC RNO 0 0 NO
43949 2008 1 3 4 2035 2130 WN 1209 SJC RNO 0 0 YES
43950 2008 1 3 4 1645 1740 WN 2645 SJC RNO 0 0 YES
43951 2008 1 3 4 1125 1220 WN 2969 SJC RNO 0 0 YES
43952 2008 1 3 4 1025 1140 WN 98 SJC SAN 0 0 YES
43953 2008 1 3 4 2115 2230 WN 390 SJC SAN 0 0 YES
43954 2008 1 3 4 1925 2045 WN 532 SJC SAN 0 0 YES
43955 2008 1 3 4 1320 1435 WN 580 SJC SAN 0 0 YES
43956 2008 1 3 4 1805 1925 WN 722 SJC SAN 0 0 YES
43957 2008 1 3 4 2005 2120 WN 943 SJC SAN 0 0 YES
43958 2008 1 3 4 750 905 WN 2203 SJC SAN 0 0 NO
43959 2008 1 3 4 1535 1650 WN 2268 SJC SAN 0 0 YES
43960 2008 1 3 4 1125 1240 WN 2878 SJC SAN 0 0 YES
43961 2008 1 3 4 635 750 WN 2992 SJC SAN 0 0 YES
43962 2008 1 3 4 755 1005 WN 152 SJC SEA 0 0 NO
43963 2008 1 3 4 1300 1510 WN 635 SJC SEA 0 0 YES
43964 2008 1 3 4 1830 2040 WN 1600 SJC SEA 0 0 YES
43965 2008 1 3 4 1450 1700 WN 2965 SJC SEA 0 0 NO
43966 2008 1 3 4 1950 2105 WN 279 SJC SNA 1 0 YES
43967 2008 1 3 4 700 815 WN 305 SJC SNA 0 0 NO
43968 2008 1 3 4 1135 1250 WN 786 SJC SNA 0 0 NO
43969 2008 1 3 4 2040 2155 WN 824 SJC SNA 0 0 NO
43970 2008 1 3 4 1340 1455 WN 1002 SJC SNA 0 0 YES
43971 2008 1 3 4 1545 1700 WN 2028 SJC SNA 0 0 YES
43972 2008 1 3 4 925 1040 WN 2648 SJC SNA 0 0 NO
43973 2008 1 3 4 1740 1855 WN 3411 SJC SNA 0 0 YES
43974 2008 1 3 4 1520 1650 WN 2969 SLC ABQ 0 0 YES
43975 2008 1 3 4 855 1025 WN 3234 SLC ABQ 0 0 NO
43976 2008 1 3 4 1625 1730 WN 1022 SLC BOI 0 0 YES
43977 2008 1 3 4 1030 1140 WN 1041 SLC BOI 0 0 YES

43978 rows × 13 columns


In [ ]:


In [ ]: